Remote Sensing Image Analysis

Exploration

# download the data
dir.create('data', showWarnings = FALSE)
if (!file.exists('data/rs/samples.rds')) {
    download.file('https://biogeo.ucdavis.edu/data/rspatial/rsdata.zip', dest = 'data/rsdata.zip')
    unzip('data/rsdata.zip', exdir='data')
}

library(raster)
## Loading required package: sp
# Blue
b2 <- raster('data/rs/LC08_044034_20170614_B2.tif')
# Green
b3 <- raster('data/rs/LC08_044034_20170614_B3.tif')
# Red
b4 <- raster('data/rs/LC08_044034_20170614_B4.tif')
# Near Infrared (NIR)
b5 <- raster('data/rs/LC08_044034_20170614_B5.tif')

# NIR, Red, Green stack
s <- stack(b5, b4, b3)

filenames <- paste0('data/rs/LC08_044034_20170614_B', 1:11, ".tif")
landsat <- stack(filenames)
landsat
## class       : RasterStack 
## dimensions  : 1245, 1497, 1863765, 11  (nrow, ncol, ncell, nlayers)
## resolution  : 30, 30  (x, y)
## extent      : 594090, 639000, 4190190, 4227540  (xmin, xmax, ymin, ymax)
## coord. ref. : +proj=utm +zone=10 +datum=WGS84 +units=m +no_defs +ellps=WGS84 +towgs84=0,0,0 
## names       : LC08_044034_20170614_B1, LC08_044034_20170614_B2, LC08_044034_20170614_B3, LC08_044034_20170614_B4, LC08_044034_20170614_B5, LC08_044034_20170614_B6, LC08_044034_20170614_B7, LC08_044034_20170614_B8, LC08_044034_20170614_B9, LC08_044034_20170614_B10, LC08_044034_20170614_B11 
## min values  :            9.641791e-02,            7.483990e-02,            4.259216e-02,            2.084067e-02,            8.457669e-04,           -7.872183e-03,           -5.052945e-03,            3.931751e-02,           -4.337332e-04,             2.897978e+02,             2.885000e+02 
## max values  :              0.73462820,              0.71775615,              0.69246972,              0.78617686,              1.01243150,              1.04320455,              1.11793602,              0.82673049,              0.03547901,             322.43139648,             317.99530029
# plotting example
par(mfrow = c(2,2))
plot(b2, main = "Blue", col = gray(0:100 / 100))
plot(b3, main = "Green", col = gray(0:100 / 100))
plot(b4, main = "Red", col = gray(0:100 / 100))
plot(b5, main = "NIR", col = gray(0:100 / 100))

# create stack from red, green blue
landsatRGB <- stack(b4, b3, b2)
# composite color plot, linearly stretching values
plotRGB(landsatRGB, axes = TRUE, stretch = "lin", main = "Landsat True Color Composite")

par(mfrow = c(1,2))
# RGB
plotRGB(landsatRGB, axes=TRUE, stretch="lin", main="Landsat True Color Composite")
# NIR, R, G
landsatFCC <- stack(b5, b4, b3)
plotRGB(landsatFCC, axes=TRUE, stretch="lin", main="Landsat False Color Composite")

Question 1: Use the plotRGB function with RasterStack landsat to create a true and false color composite (hint remember the position of the bands in the stack).

Landsat layers: Ultra Blue, Blue, Green, Red, Near Infrared (NIR), Shortwave Infrared (SWIR) 1, Shortwave Infrared (SWIR) 2, Panchromatic, Cirrus, Thermal Infrared (TIRS) 1, Thermal Infrared (TIRS) 2.

par(mfrow = c(1,2))
plotRGB(subset(landsat, 4:2), axes=TRUE, stretch="lin", main="Landsat True Color Composite")
plotRGB(subset(landsat, 5:3), axes=TRUE, stretch="lin", main="Landsat False Color Composite")

# only keep layers we need
landsat <- subset(landsat, 1:7)
# rename layers for clarity
names(landsat) <- c('ultra-blue', 'blue', 'green', 'red', 'NIR', 'SWIR1', 'SWIR2')
extent(landsat)
## class       : Extent 
## xmin        : 594090 
## xmax        : 639000 
## ymin        : 4190190 
## ymax        : 4227540
# crop landsat to defined extent
e <- extent(624387, 635752, 4200047, 4210939)
landsatcrop <- crop(landsat, e)

Question 2: Interactive selection from the image is also possible. Use drawExtent and drawPoly to select an area of interest.

Question 3: Use the RasterStack landsatcrop to create a true and false color composite.

par(mfrow = c(1,2))
plotRGB(subset(landsatcrop, 4:2), axes=TRUE, stretch="lin", main="Landsat True Color Composite")
plotRGB(subset(landsatcrop, 5:3), axes=TRUE, stretch="lin", main="Landsat False Color Composite")

# example relationship between bands
pairs(landsatcrop[[1:2]], main = "Ultra-blue versus Blue")

# extracting pixel values example
# load the polygons with land use land cover information
samp <- readRDS('data/rs/samples.rds')
# generate 300 point samples from the polygons
ptsamp <- spsample(samp, 300, type='regular')
# add the land cover class to the points
ptsamp$class <- over(ptsamp, samp)$class
# extract values with points
df <- extract(landsat, ptsamp)
# To see some of the reflectance values
head(df)
##      ultra.blue      blue     green       red       NIR     SWIR1
## [1,]  0.1382727 0.1226585 0.1119020 0.1148080 0.1874793 0.2289438
## [2,]  0.1387282 0.1229838 0.1078900 0.1116418 0.1745542 0.2378352
## [3,]  0.1398342 0.1222465 0.1062419 0.1064804 0.1773734 0.2146741
## [4,]  0.1432173 0.1469256 0.1674627 0.2262329 0.3723128 0.3660888
## [5,]  0.1410269 0.1378607 0.1534749 0.2029634 0.3282677 0.3710116
## [6,]  0.1358439 0.1345861 0.1480100 0.1983008 0.3030246 0.3339712
##          SWIR2
## [1,] 0.1906021
## [2,] 0.2048718
## [3,] 0.1760505
## [4,] 0.2197704
## [5,] 0.2330425
## [6,] 0.2033971
# getting mean spectrum for different class types
ms <- aggregate(df, list(ptsamp$class), mean)
# instead of the first column, we use row names
rownames(ms) <- ms[,1]
ms <- ms[,-1]
# Create a vector of color for the land cover classes for use in plotting
mycolor <- c('darkred', 'yellow', 'burlywood', 'cyan', 'blue')
#transform ms from a data.frame to a matrix
ms <- as.matrix(ms)
# First create an empty plot
plot(0, ylim=c(0,0.6), xlim = c(1,7), type='n', xlab="Bands", ylab = "Reflectance")
# add the different classes
for (i in 1:nrow(ms)){
  lines(ms[i,], type = "l", lwd = 3, lty = 1, col = mycolor[i])
}
# Title
title(main="Spectral Profile from Landsat", font.main = 2)
# Legend
legend("topleft", rownames(ms),
       cex=0.8, col=mycolor, lty = 1, lwd =3, bty = "n")

Basic Mathematical Operations

# function to compute NDVI from raster stack with given band numbers for NIR and R
vi <- function(img, k, i) {
  bk <- img[[k]]
  bi <- img[[i]]
  vi <- (bk - bi) / (bk + bi)
  return(vi)
}

# For Landsat NIR = 5, red = 4.
ndvi <- vi(landsat, 5, 4)
plot(ndvi, col = rev(terrain.colors(10)), main = "Landsat-NDVI")

Question 1: Adapt the code shown above to compute indices to identify i) water and ii) built-up. Hint: Use the spectral profile plot to find the bands having maximum and minimum reflectance for these two classes.

# water has larger normalized difference between ultra blue and swir2 compared to other classes
wi = function(raster, ultra_blue, swir2){
  ub = raster[[ultra_blue]]
  s2 = raster[[swir2]]
  return((ub-s2)/(ub+s2))
}

# compute index for water
wi_ras = wi(landsat, 1, 7)
# only show values above threshold of 0.4
wi_ras_mask = wi_ras
wi_ras_mask[wi_ras < 0.4] = NA
# watery color ramp
pal = colorRampPalette(c("light blue", "blue"))
plot(wi_ras_mask, col=pal(10), main="Water Cover")

# built is trickier, doesn't have larger normalized difference between any two bands compared to other classes, but it does seem to have smallest normalized absolute difference between ultra blue and SWIR1
bi = function(raster, ultra_blue, swir_1){
  b1 = raster[[ultra_blue]]
  b2 = raster[[swir_1]]
  return(1 - abs(b1-b2)/(b1+b2))
}

bi_ras = bi(landsat, 1, 6)
bi_ras_mask = bi_ras
bi_ras_mask[bi_ras< 0.8] = NA
plot(bi_ras_mask, col=pal(10), main="Built Cover")

# view histogram of data
hist(ndvi,
     main = "Distribution of NDVI values",
     xlab = "NDVI",
     ylab= "Frequency",
     col = "wheat",
     xlim = c(-0.5, 1),
     breaks = 30,
     xaxt = 'n')
axis(side=1, at = seq(-0.5,1, 0.05), labels = seq(-0.5,1, 0.05))

Question 2: Make histograms of the values the vegetation indices developed in question 1.

hist(wi_ras,
     main = "Distribution of Water Index values",
     xlab = "Water Index",
     ylab= "Frequency",
     col = "blue",
     xlim = c(-0.4, 1),
     breaks = 30,
     xaxt = 'n')
axis(side=1, at = seq(-0.5,1, 0.05), labels = seq(-0.5,1, 0.05))

hist(bi_ras,
     main = "Distribution of Built Index values",
     xlab = "Built Index",
     ylab= "Frequency",
     col = "wheat",
     xlim = c(0, 1),
     breaks = 30,
     xaxt = 'n')
axis(side=1, at = seq(-0.5,1, 0.05), labels = seq(-0.5,1, 0.05))

Question 3: Is it possible to find water using thresholding of NDVI or any other indices?

The water index constructed above seems to identify water very well using the threshold of 0.4.

Unsupervised Classification

library(raster)
landsat5 <- stack('data/rs/centralvalley-2011LT5.tif')
names(landsat5) <- c('blue', 'green', 'red', 'NIR', 'SWIR1', 'SWIR2')

Question 1: Make a 3-band False Color Composite plot of landsat5.

plotRGB(landsat5[[4:2]], stretch='lin', axes=TRUE, main='False Color Composite Image')

ndvi <- (landsat5[['NIR']] - landsat5[['red']]) / (landsat5[['NIR']] + landsat5[['red']])

# Extent to crop ndvi layer
e <- extent(-121.807, -121.725, 38.004, 38.072)
# crop landsat by the extent
ndvi <- crop(ndvi, e)
# convert raster to matrix
nr <- getValues(ndvi)

set.seed(99)
# We want to create 10 clusters, allow 500 iterations, start with 5 random sets using "Lloyd" method
kmncluster <- kmeans(na.omit(nr), centers = 10, iter.max = 500, nstart = 5, algorithm="Lloyd")
# kmeans returns an object of class "kmeans"
str(kmncluster)
## List of 9
##  $ cluster     : int [1:76608] 6 6 3 3 3 3 3 6 6 6 ...
##  $ centers     : num [1:10, 1] 0.38986 0.00498 0.29997 -0.14263 -0.20902 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:10] "1" "2" "3" "4" ...
##   .. ..$ : NULL
##  $ totss       : num 6459
##  $ withinss    : num [1:10] 5.18 6.13 4.91 6.11 5.75 ...
##  $ tot.withinss: num 55.8
##  $ betweenss   : num 6403
##  $ size        : int [1:10] 8736 4550 7156 8624 11672 6807 8932 5198 5040 9893
##  $ iter        : int 221
##  $ ifault      : NULL
##  - attr(*, "class")= chr "kmeans"
# Use the ndvi object to set the cluster values to a new raster
knr <- setValues(ndvi, kmncluster$cluster)
# You can also do it like this
knr <- raster(ndvi)
values(knr) <- kmncluster$cluster
# Define a color vector for 10 clusters (learn more about setting the color later)
mycolor <- c("#fef65b","#ff0000", "#daa520","#0000ff","#0000ff","#00ff00","#cbbeb5",
             "#c3ff5b", "#ff7373", "#00ff00", "#808080")
par(mfrow = c(1,2))
plot(ndvi, col = rev(terrain.colors(10)), main = 'Landsat-NDVI')
plot(knr, main = 'Unsupervised classification', col = mycolor )

Question 2: Plot 3-band RGB of landsat5 for the subset (extent e) and result of kmeans clustering side-by-side and make a table of land-use land-cover labels for the clusters. E.g. cluster 4 and 5 are water.

par(mfrow=c(1,2))
plotRGB(crop(landsat5[[3:1]], e), stretch='lin', axes=TRUE, main='True Color Composite Image')
plot(knr, main='Unsupervised Classification', col=mycolor)

# first visually setting classes to land type and inspect...
class_colors = c('burlywood', 'darkred', 'cyan', 'blue', 'blue', 'cyan', 'yellow', 'darkred', 'yellow', 'burlywood')
class_names = c('fallow', 'built', 'open', 'water', 'water', 'open', 'cropland', 'built', 'cropland', 'fallow')

legend_colors = c('darkred', 'yellow', 'burlywood', 'cyan', 'blue')
legend_classes = c('built', 'cropland', 'fallow', 'open', 'water')

plot(knr, main='Unsupervised Classification', col=class_colors, xaxt='n', yaxt='n')
legend('left', legend=legend_classes, fill=legend_colors)

# table of clustered classes and corresponding land type
class_codes = 1:10
cbind(class_codes, class_names)
##       class_codes class_names
##  [1,] "1"         "fallow"   
##  [2,] "2"         "built"    
##  [3,] "3"         "open"     
##  [4,] "4"         "water"    
##  [5,] "5"         "water"    
##  [6,] "6"         "open"     
##  [7,] "7"         "cropland" 
##  [8,] "8"         "built"    
##  [9,] "9"         "cropland" 
## [10,] "10"        "fallow"

Supervised Classification

Load and clean data, make sample points and plot it:

library(raster)
nlcd <- brick('data/rs/nlcd-L1.tif')
names(nlcd) <- c("nlcd2001", "nlcd2011")
# The class names and colors for plotting
nlcdclass <- c("Water", "Developed", "Barren", "Forest", "Shrubland", "Herbaceous", "Planted/Cultivated", "Wetlands")
classdf <- data.frame(classvalue1 = c(1,2,3,4,5,7,8,9), classnames1 = nlcdclass)
# Hex codes of colors
classcolor <- c("#5475A8", "#B50000", "#D2CDC0", "#38814E", "#AF963C", "#D1D182", "#FBF65D", "#C8E6F8")
# Now we ratify (RAT = "Raster Attribute Table") the ncld2011 (define RasterLayer as a categorical variable). This is helpful for plotting.
nlcd2011 <- nlcd[[2]]
nlcd2011 <- ratify(nlcd2011)
rat <- levels(nlcd2011)[[1]]
#
rat$landcover <- nlcdclass
levels(nlcd2011) <- rat

set.seed(99)
# Sampling
samp2011 <- sampleStratified(nlcd2011, size = 200, na.rm = TRUE, sp = TRUE)

library(rasterVis)
## Loading required package: lattice
## Loading required package: latticeExtra
## Loading required package: RColorBrewer
plt <- levelplot(nlcd2011, col.regions = classcolor, main = 'Distribution of Training Sites')
print(plt + layer(sp.points(samp2011, pch = 3, cex = 0.5, col = 1)))

Extract the landsat5 values from sample locations:

landsat5 <- stack('data/rs/centralvalley-2011LT5.tif')
names(landsat5) <- c('blue', 'green', 'red', 'NIR', 'SWIR1', 'SWIR2')

# Extract the layer values for the locations
sampvals <- extract(landsat5, samp2011, df = TRUE)
# sampvals no longer has the spatial information. To keep the spatial information you use `sp=TRUE` argument in the `extract` function.
# drop the ID column
sampvals <- sampvals[, -1]
# combine the class information with extracted values
sampdata <- data.frame(classvalue = samp2011@data$nlcd2011, sampvals)

Make a Classification Tree Model:

library(rpart)
# Train the model
cart <- rpart(as.factor(classvalue)~., data=sampdata, method = 'class', minsplit = 5)
# print(model.class)
# Plot the trained classification tree
plot(cart, uniform=TRUE, main="Classification Tree")
text(cart, cex = 0.8)

pr2011 <- predict(landsat5, cart, type='class')

pr2011 <- ratify(pr2011)
rat <- levels(pr2011)[[1]]
rat$legend <- classdf$classnames
levels(pr2011) <- rat
levelplot(pr2011, maxpixels = 1e6,
          col.regions = classcolor,
          scales=list(draw=FALSE),
          main = "Decision Tree classification of Landsat 5")

Question 1: Plot nlcd2011 and pr2011 side-by-side and comment about the accuracy of the prediction (e.g. mixing between cultivated crops, pasture, grassland and shrubs).

library(gridExtra)
p1 = levelplot(nlcd2011, maxpixels = 1e6,
          col.regions = classcolor,
          scales=list(draw=FALSE),
          main = "Classification of Landsat 5")
p2 = levelplot(pr2011, maxpixels = 1e6,
          col.regions = classcolor,
          scales=list(draw=FALSE),
          main = "Decision Tree classification of Landsat 5")
grid.arrange(p1, p2, ncol=2)

As seen above, the decision tree classification does not perform very well overall. It does a decent job at classifying water, and it captures general macroscopic land cover variation, but overall most of the land area is misclassified. Though most of the eastern half is cropland, the classifier frequently mixed those classifications with forest, herbaceous, and shrubland.

Model Evaluation

library(dismo)
set.seed(99)
j <- kfold(sampdata, k = 5, by=sampdata$classvalue)

x <- list()
for (k in 1:5) {
    train <- sampdata[j!= k, ]
    test <- sampdata[j == k, ]
    cart <- rpart(as.factor(classvalue)~., data=train, method = 'class', minsplit = 5)
    pclass <- predict(cart, test, type='class')
    # create a data.frame using the reference and prediction
    x[[k]] <- cbind(test$classvalue, as.integer(pclass))
}

y <- do.call(rbind, x)
y <- data.frame(y)
colnames(y) <- c('observed', 'predicted')
conmat <- table(y)
# change the name of the classes
colnames(conmat) <- classdf$classnames
rownames(conmat) <- classdf$classnames
conmat
##                     predicted
## observed             Water Developed Barren Forest Shrubland Herbaceous
##   Water                172         6      1      3         3          1
##   Developed              8        93     22     10         8         37
##   Barren                 6        54     52      6         8         62
##   Forest                 0         3      2    102        49          8
##   Shrubland              0         8      1     52       105         12
##   Herbaceous             1        16     24      4        16        121
##   Planted/Cultivated     2        20      5     24        37         24
##   Wetlands              14        12      1     32        25         16
##                     predicted
## observed             Planted/Cultivated Wetlands
##   Water                               1       13
##   Developed                          20        2
##   Barren                              5        7
##   Forest                             13       23
##   Shrubland                          17        5
##   Herbaceous                         17        1
##   Planted/Cultivated                 81        7
##   Wetlands                           32       68

Question 2: Comment on the misclassification between different classes.

From the matrix, it is clear that water was classified most accurately. Water appears to have the lowest false positive rate and lowest false negative rate. Many of the false positive/false negative rates for the other classes are above 50%. Barren land had a particularly bad false negative rate: true barren land was classified as both developed and herbaceous more often than barren.

Question 3: Can you think of ways to to improve the accuracy?

The accuracy may be improved by using more training data, using more predictor variables (i.e. more raster bands or raster band derivatives), or by using another classification method entirely. For example, a convolutional neural net might perform better at identifying built structures due to its consideration of neighborhoods surrounding each pixel.

# number of cases
n <- sum(conmat)
# number of correctly classified cases per class
diag <- diag(conmat)
# Overall Accuracy
OA <- sum(diag) / n
OA
## [1] 0.49625
# observed (true) cases per class
rowsums <- apply(conmat, 1, sum)
p <- rowsums / n
# predicted cases per class
colsums <- apply(conmat, 2, sum)
q <- colsums / n
expAccuracy <- sum(p*q)
kappa <- (OA - expAccuracy) / (1 - expAccuracy)
kappa
## [1] 0.4242857
# Producer accuracy
PA <- diag / colsums
# User accuracy
UA <- diag / rowsums
outAcc <- data.frame(producerAccuracy = PA, userAccuracy = UA)
outAcc
##                    producerAccuracy userAccuracy
## Water                     0.8472906        0.860
## Developed                 0.4386792        0.465
## Barren                    0.4814815        0.260
## Forest                    0.4377682        0.510
## Shrubland                 0.4183267        0.525
## Herbaceous                0.4306050        0.605
## Planted/Cultivated        0.4354839        0.405
## Wetlands                  0.5396825        0.340

Question 4: Perform the classification using Random Forest classifiers from the randomForest package.

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
## 
##     combine
rf = randomForest(as.factor(classvalue)~., data=sampdata, method='class', minsplit=5)

pr_rf <- predict(landsat5, rf, type='class')

Question 5: Plot the results of rpart and Random Forest classifier side-by-side.

pr_rf <- ratify(pr_rf)
rat <- levels(pr_rf)[[1]]
rat$legend <- classdf$classnames
levels(pr_rf) <- rat

p1 = levelplot(pr_rf, maxpixels = 1e6,
          col.regions = classcolor,
          scales=list(draw=FALSE),
          main = "Random Forest")
p2 = levelplot(pr2011, maxpixels = 1e6,
          col.regions = classcolor,
          scales=list(draw=FALSE),
          main = "Decision Tree")
grid.arrange(p1, p2, ncol=2)

Question 7 (optional): We have trained the classifiers using 200 samples for each class. Investigate the effect of sample size on classification. Repeat the steps with different subsets, e.g. a sample size of 150, 100, 50 per class, and compare the results. Use the same holdout samples for model evaluation.

# make subsets of sample data
ns = c(150, 100, 50)
classvals = c(1,2,3,4,5,7,8,9)
# get subset of n sample data values with classval
get_subset = function(classval, n){
  s = subset(sampdata, classvalue==classval)[1:n,]
  return(s)
}
# get n sample data values of each class
get_all_subs = function(n){
  s = lapply(classvals, FUN=get_subset, n=n)
  s = do.call("rbind", s)
  return(s)
}
# get test data (use remaining data held out from biggest subset)
get_test = function(classval){
  s = subset(sampdata, classvalue==classval)
  s = s[150:length(s),]
  return(s)
}
test = do.call("rbind", lapply(classvals, FUN=get_test))

subsets = lapply(ns, FUN=get_all_subs)

accuracies = c()
# make model for each subset and assess accuracy
for (subset in subsets){
  j <- kfold(subset, k = 5, by=subset$classvalue)
  x <- list()
  for (k in 1:5) {
      train <- subset[j!= k, ]
      cart <- rpart(as.factor(classvalue)~., data=train, method = 'class', minsplit = 5)
      pclass <- predict(cart, test, type='class')
      # create a data.frame using the reference and prediction
      x[[k]] <- cbind(test$classvalue, as.integer(pclass))
  }
  y <- do.call(rbind, x)
  y <- data.frame(y)
  colnames(y) <- c('observed', 'predicted')
  conmat <- table(y)
  # change the name of the classes
  colnames(conmat) <- classdf$classnames
  rownames(conmat) <- classdf$classnames
  # number of cases
  n <- sum(conmat)
  # number of correctly classified cases per class
  diag <- diag(conmat)
  # Overall Accuracy
  OA <- sum(diag) / n
  # save to output array
  accuracies = append(accuracies, OA)
}

data.frame(n=ns, accuracy=accuracies)
##     n  accuracy
## 1 150 0.5296875
## 2 100 0.5163194
## 3  50 0.4996528

From the computed overall accuracies above, we see that sample size does not have much of an effect for n > 50.